Downloading datasets

The dataset was downloaded from Kaggle (https://www.kaggle.com/shahir/protein-data-set).

The dataset consists of two files: (1) pdb_data_no_dups.csv with 141401 rows and 14 columns, and (2) pdb_data_seq.csv with 467304 rows and 5 columns.

Loading packages

knitr::opts_chunk$set(echo = TRUE)
library(knitr)
library(ggplot2)
library(readr)
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v purrr   0.3.2     v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rio)
library(dplyr)
library(stringr)
library(tableone)
library(visdat)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:rio':
## 
##     export
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Importing data (2 files) and removing duplicates

# Read the sequence table; it can hold several rows per structureId.
pdb_data_seq1 <- read_csv("Data/pdb_data_seq.csv")
# Keep only the first row seen for each structureId so the later join
# on structureId matches at most one sequence row per structure.
pdb_data_seq2 <- pdb_data_seq1 %>%
  dplyr::filter(!duplicated(structureId))
# Read the structure-level table.
pdb_data_no_dups <- read_csv("Data/pdb_data_no_dups.csv")

Joining the datasets and visualizing the structure of the data

# Left join: every row of pdb_data_no_dups is kept and the matching
# de-duplicated sequence row (if any) is attached by structureId.
PDB <- pdb_data_no_dups %>%
  left_join(pdb_data_seq2, by = "structureId")
# Overview plot of the joined table; the large-data warning is switched off.
a <- vis_dat(PDB, warn_large_data = FALSE)
a

Removing duplicated columns and renaming

# The join suffixed the columns present in both tables with .x / .y.
# Drop the .y copies (from the sequence table) ...
PDB1 <- PDB %>%
  select(-macromoleculeType.y, -residueCount.y)
# ... and give the remaining .x copies their plain names back.
PDB2 <- PDB1 %>%
  rename(
    macromoleculeType = macromoleculeType.x,
    residueCount = residueCount.x
  )

Arranging by publication year and changing datatype

# Sort ascending by publication year.
PDB3 <- arrange(PDB2, publicationYear)
# Drop the first row of the sorted table, then convert the categorical
# columns to factors for plotting.
# NOTE(review): the dropped row is presumably an erroneous publicationYear
# entry that sorts first -- confirm against the data.
PDB4 <- PDB3 %>%
  slice(-1) %>%
  mutate(
    experimentalTechnique = as.factor(experimentalTechnique),
    classification = as.factor(classification),
    publicationYear = as.factor(publicationYear),
    macromoleculeType = as.factor(macromoleculeType)
  )

Figure 1

Missing values in the publication year column were removed

# Figure 1: count of structures per publication year, with each bar
# split by experimental technique.
# Rows without a publication year are removed first.
PDB5 <- dplyr::filter(PDB4, !is.na(publicationYear))
p3 <- ggplot(PDB5, aes(x = publicationYear)) +
  # TRUE is spelled out: T is an ordinary variable that can be reassigned.
  geom_bar(aes(fill = experimentalTechnique), show.legend = TRUE) +
  theme_classic() +
  theme(
    # Rotated, small tick labels so the year labels fit along the axis.
    axis.text.x = element_text(angle = 90, size = 5),
    # Legend is drawn inside the plotting area with a compact key and text.
    legend.position = c(0.4, 0.8),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  xlab("Year of Publication")
p3

Figure 2

Missing values in experimentalTechnique and resolution were removed

# Figure 2: resolution for each experimental technique, coloured by
# macromolecule type.
# Keep only rows where both the technique and the resolution are known.
PDB5 <- PDB4 %>%
  dplyr::filter(!is.na(experimentalTechnique))
PDB6 <- PDB5 %>%
  dplyr::filter(!is.na(resolution))
p1 <- ggplot(PDB6, aes(x = experimentalTechnique, y = resolution)) +
  # Jittered, semi-transparent points so overlapping values stay visible.
  geom_jitter(aes(colour = macromoleculeType), size = 1, alpha = 0.5) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, size = 7),
    legend.position = c(0.9, 0.7),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  xlab("Experimental Technique")
p1

Figure 3

Resolution vs molecular weight for structures published in the year 2017. Only structures with a resolution of at most 10 were kept, and missing values for molecular weight, resolution and macromolecule type were removed

# Figure 3: interactive scatter of resolution against molecular weight for
# structures published in 2017.
# All conditions must hold for a row to be kept; publicationYear is a factor
# at this point, so the comparison matches the level labelled 2017.
PDB7 <- PDB4 %>%
  dplyr::filter(
    resolution <= 10,
    publicationYear == 2017,
    !is.na(structureMolecularWeight),
    !is.na(resolution),
    !is.na(macromoleculeType)
  )
# NOTE(review): `label = classification` is not drawn by geom_point();
# presumably it is there so the plotly tooltip shows it -- confirm.
p1 <- ggplot(
  PDB7,
  aes(x = structureMolecularWeight, y = resolution, label = classification)
) +
  geom_point(aes(colour = macromoleculeType), alpha = 0.5) +
  theme(
    legend.position = c(0.4, 0.8),
    legend.text = element_text(size = 5),
    legend.key.size = unit(1, "mm")
  ) +
  xlab("Molecular Weight")
# Convert the static plot to plotly, add a range slider under the x axis,
# and set the hover mode to "x".
p1 <- p1 %>%
  ggplotly(dynamicTicks = TRUE) %>%
  rangeslider(start = 0) %>%
  layout(hovermode = "x")
p1